{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "rtox72csOQUN"
   },
   "source": [
    "# DeepMatch 样例代码\n",
    "- https://github.com/shenweichen/DeepMatch\n",
    "- https://deepmatch.readthedocs.io/en/latest/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "bTWHz-heMkyw"
   },
   "source": [
    "# 下载movielens-1M数据 安装依赖包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 13146,
     "status": "ok",
     "timestamp": 1656786953325,
     "user": {
      "displayName": "沈伟臣",
      "userId": "00399522274399293678"
     },
     "user_tz": -480
    },
    "id": "yTl6d6jO1oqf",
    "outputId": "16a19888-344e-4dbe-c723-effe370222f7"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2022-07-02 18:35:40--  http://files.grouplens.org/datasets/movielens/ml-1m.zip\n",
      "Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152\n",
      "Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 5917549 (5.6M) [application/zip]\n",
      "Saving to: ‘./ml-1m.zip’\n",
      "\n",
      "./ml-1m.zip         100%[===================>]   5.64M  3.43MB/s    in 1.6s    \n",
      "\n",
      "2022-07-02 18:35:42 (3.43 MB/s) - ‘./ml-1m.zip’ saved [5917549/5917549]\n",
      "\n",
      "--2022-07-02 18:35:42--  https://raw.githubusercontent.com/shenweichen/DeepMatch/master/examples/preprocess.py\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 5642 (5.5K) [text/plain]\n",
      "Saving to: ‘preprocess.py’\n",
      "\n",
      "preprocess.py       100%[===================>]   5.51K  --.-KB/s    in 0s      \n",
      "\n",
      "2022-07-02 18:35:42 (75.7 MB/s) - ‘preprocess.py’ saved [5642/5642]\n",
      "\n",
      "Archive:  ml-1m.zip\n",
      "  inflating: ml-1m/movies.dat        \n",
      "  inflating: ml-1m/ratings.dat       \n",
      "  inflating: ml-1m/README            \n",
      "  inflating: ml-1m/users.dat         \n",
      "\u001b[33mWARNING: Skipping tensorflow as it is not installed.\u001b[0m\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "deepctr 0.9.1 requires h5py==2.10.0, but you have h5py 3.1.0 which is incompatible.\u001b[0m\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "tensorflow-gpu 2.5.0 requires h5py~=3.1.0, but you have h5py 2.10.0 which is incompatible.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip \n",
    "! wget https://raw.githubusercontent.com/shenweichen/DeepMatch/master/examples/preprocess.py -O preprocess.py\n",
    "! unzip -o ml-1m.zip \n",
    "! pip uninstall -y -q tensorflow\n",
    "! pip install -q tensorflow-gpu==2.5.0\n",
    "! pip install -q deepmatch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "p9UxNHuPMuW2"
   },
   "source": [
    "# 导入需要的库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "executionInfo": {
     "elapsed": 3234,
     "status": "ok",
     "timestamp": 1656786956552,
     "user": {
      "displayName": "沈伟臣",
      "userId": "00399522274399293678"
     },
     "user_tz": -480
    },
    "id": "C_ZR6gzp1E2N"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n",
    "from preprocess import gen_data_set, gen_model_input\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from tensorflow.python.keras import backend as K\n",
    "from tensorflow.python.keras.models import Model\n",
    "\n",
    "from deepmatch.models import *\n",
    "from deepmatch.utils import sampledsoftmaxloss, NegativeSampler"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "fQq6O9XAMzPF"
   },
   "source": [
    "# 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 5508,
     "status": "ok",
     "timestamp": 1656786962055,
     "user": {
      "displayName": "沈伟臣",
      "userId": "00399522274399293678"
     },
     "user_tz": -480
    },
    "id": "lcO29zFb21Od",
    "outputId": "9c60dffa-5829-40bc-c54d-cde4ae2bea72"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      "  after removing the cwd from sys.path.\n",
      "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      "  \n",
      "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "data_path = \"./\"\n",
    "\n",
    "unames = ['user_id','gender','age','occupation','zip']\n",
    "user = pd.read_csv(data_path+'ml-1m/users.dat',sep='::',header=None,names=unames)\n",
    "rnames = ['user_id','movie_id','rating','timestamp']\n",
    "ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)\n",
    "mnames = ['movie_id','title','genres']\n",
    "movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding=\"unicode_escape\")\n",
    "movies['genres'] = list(map(lambda x: x.split('|')[0], movies['genres'].values))\n",
    "\n",
    "data = pd.merge(pd.merge(ratings,movies),user)#.iloc[:10000]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "L0yCWxQxM3se"
   },
   "source": [
    "# 构建特征列，训练模型，导出embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 400136,
     "status": "ok",
     "timestamp": 1656787362187,
     "user": {
      "displayName": "沈伟臣",
      "userId": "00399522274399293678"
     },
     "user_tz": -480
    },
    "id": "BMOvk_de2ML3",
    "outputId": "2c5ac944-b866-4a63-be70-fedc9c257812"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6040/6040 [00:12<00:00, 482.49it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8 8\n",
      "Train on 988129 samples\n",
      "Epoch 1/20\n",
      "988129/988129 [==============================] - 44s 44us/sample - loss: 4.7422\n",
      "Epoch 2/20\n",
      "988129/988129 [==============================] - 47s 47us/sample - loss: 4.3175\n",
      "Epoch 3/20\n",
      "988129/988129 [==============================] - 52s 53us/sample - loss: 4.1604\n",
      "Epoch 4/20\n",
      "988129/988129 [==============================] - 56s 56us/sample - loss: 4.0652\n",
      "Epoch 5/20\n",
      "988129/988129 [==============================] - 53s 54us/sample - loss: 3.9986\n",
      "Epoch 6/20\n",
      "988129/988129 [==============================] - 51s 52us/sample - loss: 3.9487\n",
      "Epoch 7/20\n",
      "988129/988129 [==============================] - 45s 45us/sample - loss: 3.9083\n",
      "Epoch 8/20\n",
      "988129/988129 [==============================] - 44s 45us/sample - loss: 3.8760\n",
      "Epoch 9/20\n",
      "988129/988129 [==============================] - 51s 51us/sample - loss: 3.8502\n",
      "Epoch 10/20\n",
      "988129/988129 [==============================] - 51s 52us/sample - loss: 3.8282\n",
      "Epoch 11/20\n",
      "988129/988129 [==============================] - 54s 55us/sample - loss: 3.8102\n",
      "Epoch 12/20\n",
      "988129/988129 [==============================] - 51s 51us/sample - loss: 3.7938\n",
      "Epoch 13/20\n",
      "988129/988129 [==============================] - 41s 41us/sample - loss: 3.7804\n",
      "Epoch 14/20\n",
      "988129/988129 [==============================] - 48s 48us/sample - loss: 3.7676\n",
      "Epoch 15/20\n",
      "988129/988129 [==============================] - 65s 65us/sample - loss: 3.7577\n",
      "Epoch 16/20\n",
      "988129/988129 [==============================] - 53s 53us/sample - loss: 3.7482\n",
      "Epoch 17/20\n",
      "988129/988129 [==============================] - 50s 50us/sample - loss: 3.7394\n",
      "Epoch 18/20\n",
      "988129/988129 [==============================] - 56s 57us/sample - loss: 3.7311\n",
      "Epoch 19/20\n",
      "988129/988129 [==============================] - 52s 53us/sample - loss: 3.7238\n",
      "Epoch 20/20\n",
      "988129/988129 [==============================] - 56s 57us/sample - loss: 3.7180\n",
      "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n",
      "(6040, 32)\n",
      "(3706, 32)\n"
     ]
    }
   ],
   "source": [
    "#data = pd.read_csvdata = pd.read_csv(\"./movielens_sample.txt\")\n",
    "sparse_features = [\"movie_id\", \"user_id\",\n",
    "                    \"gender\", \"age\", \"occupation\", \"zip\", \"genres\"]\n",
    "SEQ_LEN = 50\n",
    "negsample = 0\n",
    "\n",
    "# 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`\n",
    "\n",
    "#features = ['user_id','movie_id','gender', 'age', 'occupation', 'zip']\n",
    "feature_max_idx = {}\n",
    "for feature in sparse_features:\n",
    "    lbe = LabelEncoder()\n",
    "    data[feature] = lbe.fit_transform(data[feature]) + 1\n",
    "    feature_max_idx[feature] = data[feature].max() + 1\n",
    "\n",
    "user_profile = data[[\"user_id\", \"gender\", \"age\", \"occupation\", \"zip\"]].drop_duplicates('user_id')\n",
    "\n",
    "item_profile = data[[\"movie_id\",\"genres\"]].drop_duplicates('movie_id')\n",
    "\n",
    "user_profile.set_index(\"user_id\", inplace=True)\n",
    "\n",
    "user_item_list = data.groupby(\"user_id\")['movie_id'].apply(list)\n",
    "\n",
    "train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)\n",
    "\n",
    "train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)\n",
    "test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)\n",
    "\n",
    "# 2.count #unique features for each sparse field and generate feature config for sequence feature\n",
    "\n",
    "embedding_dim = 32\n",
    "\n",
    "user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),\n",
    "                        SparseFeat(\"gender\", feature_max_idx['gender'], 16),\n",
    "                        SparseFeat(\"age\", feature_max_idx['age'], 16),\n",
    "                        SparseFeat(\"occupation\", feature_max_idx['occupation'], 16),\n",
    "                        SparseFeat(\"zip\", feature_max_idx['zip'], 16),\n",
    "                        VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,\n",
    "                                                    embedding_name=\"movie_id\"), SEQ_LEN, 'mean', 'hist_len'),\n",
    "                        VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,\n",
    "                                embedding_name=\"genres\"), SEQ_LEN, 'mean', 'hist_len'),\n",
    "                        ]\n",
    "\n",
    "item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim),\n",
    "                        SparseFeat('genres', feature_max_idx['genres'], embedding_dim)\n",
    "                       ]\n",
    "\n",
    "from collections import Counter\n",
    "train_counter = Counter(train_model_input['movie_id'])\n",
    "item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)]\n",
    "sampler_config = NegativeSampler('inbatch',num_sampled=255,item_name=\"movie_id\",item_count=item_count)\n",
    "\n",
    "# 3.Define Model and train\n",
    "\n",
    "import tensorflow as tf\n",
    "if tf.__version__ >= '2.0.0':\n",
    "    tf.compat.v1.disable_eager_execution()\n",
    "else:\n",
    "    K.set_learning_phase(True)\n",
    "    \n",
    "model = DSSM(user_feature_columns, item_feature_columns,user_dnn_hidden_units=(128,64, embedding_dim),\n",
    "             item_dnn_hidden_units=(64, embedding_dim,),loss_type='softmax',sampler_config=sampler_config)\n",
    "\n",
    "model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss)\n",
    "\n",
    "history = model.fit(train_model_input, train_label,  # train_label,\n",
    "                    batch_size=256, epochs=20, verbose=1, validation_split=0.0, )\n",
    "\n",
    "# 4. Generate user features for testing and full item features for retrieval\n",
    "test_user_model_input = test_model_input\n",
    "all_item_model_input = {\"movie_id\": item_profile['movie_id'].values,\"genres\": item_profile['genres'].values}\n",
    "\n",
    "user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)\n",
    "item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)\n",
    "\n",
    "user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)\n",
    "# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND\n",
    "item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n",
    "\n",
    "print(user_embs.shape)\n",
    "print(item_embs.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "w_G3KWslKmJo"
   },
   "source": [
    "# 使用faiss进行ANN查找并评估结果"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "5SvyQLNVKkcs"
   },
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 2988,
     "status": "ok",
     "timestamp": 1656787365168,
     "user": {
      "displayName": "沈伟臣",
      "userId": "00399522274399293678"
     },
     "user_tz": -480
    },
    "id": "j2ZNYNBOOqrN",
    "outputId": "7c66659a-582f-4747-e025-5f6c6253c15f"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
      "Requirement already satisfied: faiss-cpu in /usr/local/lib/python3.7/dist-packages (1.7.2)\n"
     ]
    }
   ],
   "source": [
    "! pip install faiss-cpu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 3764,
     "status": "ok",
     "timestamp": 1656787368915,
     "user": {
      "displayName": "沈伟臣",
      "userId": "00399522274399293678"
     },
     "user_tz": -480
    },
    "id": "6TY1l27iJU8U",
    "outputId": "6e842add-350e-41d8-8e8b-fd4652e74e17"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "6040it [00:02, 2487.51it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "recall 0.3554635761589404\n",
      "hit rate 0.3554635761589404\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "test_true_label = {line[0]:[line[1]] for line in test_set}\n",
    "\n",
    "import numpy as np\n",
    "import faiss\n",
    "from tqdm import tqdm\n",
    "from deepmatch.utils import recall_N\n",
    "\n",
    "index = faiss.IndexFlatIP(embedding_dim)\n",
    "# faiss.normalize_L2(item_embs)\n",
    "index.add(item_embs)\n",
    "# faiss.normalize_L2(user_embs)\n",
    "D, I = index.search(np.ascontiguousarray(user_embs), 50)\n",
    "s = []\n",
    "hit = 0\n",
    "for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):\n",
    "    try:\n",
    "        pred = [item_profile['movie_id'].values[x] for x in I[i]]\n",
    "        filter_item = None\n",
    "        recall_score = recall_N(test_true_label[uid], pred, N=50)\n",
    "        s.append(recall_score)\n",
    "        if test_true_label[uid] in pred:\n",
    "            hit += 1\n",
    "    except:\n",
    "        print(i)\n",
    "print(\"\")\n",
    "print(\"recall\", np.mean(s))\n",
    "print(\"hit rate\", hit / len(test_user_model_input['user_id']))"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "colab_MovieLen1M_YoutubeDNN.ipynb",
   "provenance": [],
   "toc_visible": true
  },
  "gpuClass": "standard",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
